# Load the librarian package, installing it first if missing.
# requireNamespace() is preferred over require() for dependency checks:
# it tests availability without attaching and without require()'s
# return-FALSE-instead-of-error footgun.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
library(librarian)
# Attach (installing if needed) every package used in this analysis.
# NOTE: dplyr, ggplot2, here and readr were previously listed twice;
# duplicates removed.
librarian::shelf(
  dismo, dplyr, DT, GGally, ggplot2, here, htmltools, leaflet, mapview,
  purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr,
  caret,      # m: modeling framework
  pdp,        # X: partial dependence plots
  ranger,     # m: random forest modeling
  rpart,      # m: recursive partition modeling
  rpart.plot, # m: recursive partition plotting
  rsample,    # d: split train/test data
  skimr,      # d: skim summarize data table
  vip)        # X: variable importance
# Mask raster::select so an unqualified select() is always the dplyr verb.
select <- dplyr::select
options(readr.show_col_types = FALSE)

# Set random seed for reproducibility of random sampling and data splits.
set.seed(42)

# Directory to store downloaded and derived data.
# Use TRUE/FALSE rather than the reassignable shorthands T/F.
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = FALSE, recursive = TRUE)
Western black widow
# File paths for the raw observation table and its spatial (GeoJSON) version.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- TRUE  # TRUE forces a fresh GBIF download even if obs_geo exists

if (!file.exists(obs_geo) | redo){
  # Get species occurrence data (with coordinates) from GBIF.
  (res <- spocc::occ(
    # query = 'Bradypus variegatus', # sloth
    # query = 'Phengaris alcon', # Alcon blue
    # query = 'Camponotus pennsylvanicus', # ant
    query = 'Latrodectus hesperus', # western black widow
    from = 'gbif', has_coords = TRUE,
    limit = 10000))

  # Extract the GBIF data frame from the result and cache it as CSV.
  df <- res$gbif$data[[1]]
  readr::write_csv(df, obs_csv)

  # Convert lon/lat columns to sf points (WGS84, EPSG:4326) and keep only
  # the provider and record key (the rest is joinable back from obs_csv).
  obs <- df %>%
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>%
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn = TRUE)
}
# Read the cached observation points back in.
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 4936
# show points on map
# mapview::mapview(obs, map.types = "Stamen.Terrain")
# NOTE(review): Stamen basemaps migrated to Stadia Maps in 2023, so the
# "Stamen.*" tile names may no longer resolve — confirm the basemap still
# renders, and switch provider if the map comes up blank.
mapview::mapview(obs, map.types = "Stamen.Watercolor")
4,936 observations with coordinates were returned from GBIF.
No data points were removed for this terrestrial species because there are no known data points on water.
# Directory where sdmpredictors caches downloaded environmental layers.
dir_env <- file.path(dir_data, "env")
options(sdmpredictors_datadir = dir_env)

# List the terrestrial (non-marine) predictor datasets and browse them.
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
env_datasets %>%
  select(dataset_code, description, citation) %>%
  DT::datatable()

# Restrict to two datasets and list every layer they provide.
env_datasets_vec <- c("WorldClim", "ENVIREM")
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
What environmental layers did you choose as predictors? Can you find any support for these in the literature?
Altitude, temperature, terrain, and wetness. Western black widows are found in desert as well as mountainous regions. Although they prefer dark, dry conditions, they are able to survive in a variety of environments.
# Layers chosen after inspection and consulting the literature (see above).
env_layers_vec <- c("WC_alt",     # altitude
                    "WC_bio1",    # annual mean temperature
                    "WC_bio2",    # mean diurnal temperature range
                    "ER_tri",     # terrain roughness index
                    "ER_topoWet") # topographic wetness
# Download (or read cached) rasters as a stack; namespace-qualified for
# consistency with the other sdmpredictors:: calls in this script.
env_stack <- sdmpredictors::load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc = 2)
# Paths for the observation convex hull and the clipped raster stack.
obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # Convex hull around all observation points, used to clip the rasters.
  obs_hull <- sf::st_convex_hull(st_union(obs))
  # Save the hull; delete_dsn = TRUE overwrites any stale file, consistent
  # with the other write_sf() calls in this script.
  write_sf(obs_hull, obs_hull_geo, delete_dsn = TRUE)
}
obs_hull <- read_sf(obs_hull_geo)

# Show points and hull on a map.
mapview(
  list(obs, obs_hull))

if (!file.exists(env_stack_grd) | redo){
  # Mask and crop the global rasters to the hull to save space and time.
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = TRUE)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc = 2)
# Paths for pseudo-absence points and the combined labeled point data.
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # Raster of observation counts per cell (cells with >= 1 obs are non-NA).
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field = 1, fun = 'count')
  # show map
  # mapview(obs) +
  #   mapview(r_obs)

  # Mask out cells containing observations, leaving candidate absence cells.
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)

  # Sample pseudo-absence points (same count as presences) from the mask;
  # reproducible because of the set.seed() call at the top of the script.
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
    as_tibble() %>%
    st_as_sf(coords = c("x", "y"), crs = 4326)
  write_sf(absence, absence_geo, delete_dsn = TRUE)
}
absence <- read_sf(absence_geo)

# Map presence (green) vs pseudo-absence (gray) points.
mapview(obs, col.regions = "green") +
  mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){
  # Combine presence (1) and pseudo-absence (0) into one labeled point set,
  # assigning a sequential ID for joining back to raster extractions.
  pts <- rbind(
    obs %>%
      mutate(
        present = 1) %>%
      select(present, key),
    absence %>%
      mutate(
        present = 0,
        key     = NA)) %>%
    mutate(
      ID = row_number()) %>% # idiomatic dplyr equivalent of 1:n()
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # Extract the raster predictor values at every point.
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>%
    tibble() %>%
    # Join the present label and geometry back onto the raster values.
    left_join(
      pts %>%
        select(ID, present),
      by = "ID") %>%
    relocate(present, .after = ID) %>%
    # Split the geometry into plain numeric lon/lat columns for modeling.
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>%
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

# Peek at the first 10 presence rows and the last 10 absence rows.
n_pts <- nrow(pts_env)
pts_env %>%
  slice(c(1:10, (n_pts - 9):n_pts)) %>%
  DT::datatable(
    rownames = FALSE,
    options = list(
      dom = "t",
      pageLength = 20))

# Density of each predictor, faceted by variable and split by presence (1)
# vs absence (0).
pts_env %>%
  select(-ID) %>%
  mutate(
    present = factor(present)) %>%
  pivot_longer(-present) %>%
  ggplot() +
  geom_density(aes(x = value, fill = present)) +
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() +
  facet_wrap(~name, scales = "free") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))
# Re-read the point-environment table (this section can run stand-alone).
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 9872
datatable(pts_env, rownames = F)

# Pairwise relationships among predictors, colored by presence/absence.
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))

# Model data: drop the ID column and any rows with missing values.
d <- pts_env %>%
  select(-ID) %>%
  tidyr::drop_na()
nrow(d)
## [1] 9847

# Fit an ordinary linear model of presence (0/1) on all predictors.
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01232 -0.12666 0.09589 0.19588 0.76372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.758e-01 4.892e-02 -3.593 0.000328 ***
## WC_alt 5.522e-06 6.946e-06 0.795 0.426666
## WC_bio1 1.434e-02 7.628e-04 18.800 < 2e-16 ***
## WC_bio2 3.353e-02 1.513e-03 22.166 < 2e-16 ***
## ER_tri -1.394e-03 1.693e-04 -8.233 < 2e-16 ***
## ER_topoWet -5.572e-02 3.512e-03 -15.866 < 2e-16 ***
## lon -5.525e-03 7.452e-05 -74.132 < 2e-16 ***
## lat 9.038e-03 1.930e-04 46.830 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2982 on 9839 degrees of freedom
## Multiple R-squared: 0.6446, Adjusted R-squared: 0.6443
## F-statistic: 2549 on 7 and 9839 DF, p-value: < 2.2e-16
# Linear-model predictions are unbounded — note values outside [0, 1] below.
y_predict <- predict(mdl, d, type = "response")
y_true <- d$present
range(y_predict)
## [1] -0.7637207 1.0563431
range(y_true)
## [1] 0 1

# Logistic regression (binomial GLM, logit link) keeps predictions in (0, 1).
mdl <- glm(present ~ ., family = binomial(link = "logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2295 -0.0018 0.1086 0.3591 3.8939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.738e+01 1.450e+00 -18.885 < 2e-16 ***
## WC_alt 2.016e-03 1.679e-04 12.006 < 2e-16 ***
## WC_bio1 4.335e-01 3.587e-02 12.086 < 2e-16 ***
## WC_bio2 6.486e-04 2.961e-02 0.022 0.983
## ER_tri -1.983e-02 2.342e-03 -8.467 < 2e-16 ***
## ER_topoWet -2.361e-01 5.830e-02 -4.049 5.14e-05 ***
## lon -1.595e-01 6.720e-03 -23.732 < 2e-16 ***
## lat 1.996e-01 2.310e-02 8.640 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13650.8 on 9846 degrees of freedom
## Residual deviance: 3623.6 on 9839 degrees of freedom
## AIC: 3639.6
##
## Number of Fisher Scoring iterations: 9
# GLM predicted probabilities now fall within (0, 1), unlike the lm fit.
y_predict <- predict(mdl, d, type = "response")
range(y_predict)
## [1] 2.220446e-16 9.945647e-01

# Per-term partial effect plots with partial residuals and standard errors;
# main = FALSE suppresses per-panel titles (TRUE/FALSE over reassignable T/F).
termplot(mdl, partial.resid = TRUE, se = TRUE, main = FALSE, ylim = "free")
# mgcv provides generalized additive models.
librarian::shelf(mgcv)

# GAM: each predictor enters through its own smooth term s(), letting the
# response vary non-linearly with every variable.
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) +
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
  family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -280.2 1686.6 -0.166 0.868
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 7.340 7.793 82.218 <2e-16 ***
## s(WC_bio1) 6.175 7.111 199.136 <2e-16 ***
## s(WC_bio2) 6.294 7.305 90.872 <2e-16 ***
## s(ER_tri) 2.562 3.318 10.441 0.018 *
## s(ER_topoWet) 1.001 1.003 0.004 0.959
## s(lon) 5.869 6.294 243.877 <2e-16 ***
## s(lat) 5.539 5.907 258.732 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.854 Deviance explained = 81.9%
## UBRE = -0.74126 Scale est. = 1 n = 9847
# Show term plots for the fitted GAM smooths (scale = 0 gives each panel
# its own y-axis range).
plot(mdl, scale=0)
Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?
# Load extra packages; maptools supplies the wrld_simpl dataset used below.
# NOTE(review): maptools was retired from CRAN in 2023 — confirm it is still
# installable here, or source country outlines from another package.
librarian::shelf(
  maptools, sf)

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# Show version of MaxEnt (calling maxent() with no arguments prints it).
if (!interactive()) {
  maxent()
}
## This is MaxEnt version 3.4.3

# Get environmental rasters.
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc = 2)

# Presence-only observation points (maxent extracts raster values for you).
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# Fit a maximum entropy model, or load the cached fit if one exists.
if (!file.exists(mdl_maxent_rds)){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)

# Plot variable contributions per predictor.
plot(mdl)
# Plot response curves (term plots) per predictor.
response(mdl)

# Predict suitability across the raster stack and map it with country borders.
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?
# Options: disable scientific notation; quiet readr column-type messages
# (TRUE/FALSE over the reassignable shorthands T/F).
options(
  scipen = 999,
  readr.show_col_types = FALSE)
set.seed(42) # reproducible train/test split and random forests

# Graphical theme for all subsequent ggplot2 output.
ggplot2::theme_set(ggplot2::theme_light())

# Paths (this section can run stand-alone).
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# Read data; the response becomes a factor so models fit classification.
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
  select(-ID) %>% # not used as a predictor x
  mutate(
    present = factor(present)) %>% # categorical response
  na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 9847 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 1: 4928, 0: 4919 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 696.31 | 725.91 | -70.00 | 167.00 | 404.00 | 1054.00 | 5153.00 | ▇▂▁▁▁ |
| WC_bio1 | 0 | 1 | 16.91 | 7.00 | -4.80 | 11.60 | 16.60 | 23.10 | 30.60 | ▁▃▇▆▆ |
| WC_bio2 | 0 | 1 | 13.66 | 2.66 | 4.80 | 11.80 | 14.00 | 15.80 | 20.10 | ▁▃▆▇▂ |
| ER_tri | 0 | 1 | 29.62 | 35.17 | 0.00 | 5.88 | 15.68 | 42.40 | 312.27 | ▇▁▁▁▁ |
| ER_topoWet | 0 | 1 | 11.42 | 1.77 | 6.71 | 10.03 | 11.53 | 12.79 | 15.73 | ▂▆▇▇▂ |
| lon | 0 | 1 | -75.81 | 54.58 | -130.32 | -117.24 | -104.71 | -55.29 | 123.29 | ▇▂▂▁▁ |
| lat | 0 | 1 | 25.52 | 20.30 | -50.62 | 18.96 | 33.71 | 38.54 | 54.32 | ▁▁▂▅▇ |
# Split data into training (80%) and testing, stratified on the response so
# both partitions keep the same presence/absence balance.
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)

# Class balance: full data vs training set.
table(d$present)
##
## 0 1
## 4919 4928
table(d_train$present)
##
## 0 1
## 3935 3942

# Decision stump: a single split (maxdepth = 1) with no complexity pruning.
mdl <- rpart(
  present ~ ., data = d_train,
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 7877
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 7877 3935 1 (0.499555668 0.500444332)
## 2) lon>=-96.97078 3436 13 0 (0.996216531 0.003783469) *
## 3) lon< -96.97078 4441 512 1 (0.115289349 0.884710651) *

# Plot the stump.
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
# Decision tree with rpart's default control parameters.
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 7877
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 7877 3935 1 (0.499555668 0.500444332)
## 2) lon>=-96.97078 3436 13 0 (0.996216531 0.003783469) *
## 3) lon< -96.97078 4441 512 1 (0.115289349 0.884710651)
## 6) WC_bio1< 5.15 187 59 0 (0.684491979 0.315508021) *
## 7) WC_bio1>=5.15 4254 384 1 (0.090267983 0.909732017) *
rpart.plot(mdl)

# Complexity-parameter plot and rpart's built-in cross-validation table.
plotcp(mdl)
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.86658196 0 1.0000000 1.0259212 0.011273787
## 2 0.01753494 1 0.1334180 0.1341804 0.005640348
## 3 0.01000000 2 0.1158831 0.1176620 0.005305076

# Tune cp over 20 candidate values with 10-fold cross-validation via caret.
mdl_caret <- train(
  present ~ .,
  data = d_train,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20)
ggplot(mdl_caret)
Based on the complexity plot threshold, what size of tree is recommended?
# Variable importance for the caret-tuned tree.
vip(mdl_caret, num_features = 40, bar = FALSE)

# Partial dependence: two single-predictor plots plus a 2-D surface over
# latitude and mean diurnal temperature range.
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Arrange the three plots side by side.
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
what are the top 3 most important variables of your model?
# Number of predictor features (all columns except the response).
n_features <- length(setdiff(names(d_train), "present"))
# Fit a default random forest; present is a factor, so ranger fits a
# classification forest here.
mdl_rf <- ranger(present ~ ., data = d_train)
# NOTE(review): for a classification forest, ranger's prediction.error is the
# out-of-bag misclassification rate, so its square root is not an RMSE —
# confirm whether the raw error rate should be reported instead.
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1922056
# Re-run the model with impurity-based variable importance.
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# Re-run the model with permutation-based variable importance.
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
# Compare the two importance rankings side by side.
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)